In [ ]:
import os

import numpy as np
import pandas as pd
import polars as pl
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from catboost import CatBoostRegressor

import warnings
# Suppress all warnings to keep the notebook output clean
warnings.filterwarnings("ignore")
In [ ]:
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

## Define important columns
data_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_prices_cols = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_prices_cols = ['forecast_date', 'euros_per_mwh']
forecast_weather_cols = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_weather_cols = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure','cloudcover_total','cloudcover_low','cloudcover_mid','cloudcover_high','windspeed_10m','winddirection_10m','shortwave_radiation','direct_solar_radiation','diffuse_radiation','latitude','longitude']
location_cols = ['longitude', 'latitude', 'county']
target_cols = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime']

## Importing only specified columns
df_data = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas_prices = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_prices_cols, try_parse_dates=True)
df_electricity_prices = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_prices_cols, try_parse_dates=True)
df_forecast_weather = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_weather_cols, try_parse_dates=True)
df_historical_weather = pl.read_csv(os.path.join(root, "historical_weather.csv"), columns=historical_weather_cols, try_parse_dates=True)
df_weather_station_to_county_mapping = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)
df_target = df_data.select(target_cols)

## Save the parsed schemas so dataframes arriving from the test API can be cast to the same dtypes
schema_data = df_data.schema
schema_client = df_client.schema
schema_gas = df_gas_prices.schema
schema_electricity = df_electricity_prices.schema
schema_forecast = df_forecast_weather.schema
schema_historical = df_historical_weather.schema
schema_target = df_target.schema
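In [ ]:
# Illustration only (toy frame, hypothetical values, not part of the pipeline):
# schema_overrides lets polars cast selected columns while converting from pandas,
# which is how the schemas saved above are applied to the frames returned by the
# competition API further below.
_toy = pd.DataFrame({"county": [0, 1], "installed_capacity": [100.0, 250.5]})
print(pl.from_pandas(_toy, schema_overrides={"county": pl.Int64, "installed_capacity": pl.Float32}).schema)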
In [ ]:
def generate_features(df_data, df_client, df_gas_prices, df_electricity_prices, df_forecast_weather, df_historical_weather, df_weather_station_to_county_mapping, df_target):
    df_data = (
        df_data.with_columns(pl.col("datetime").cast(pl.Date).alias("date"),)
    )
    
    df_gas_prices = (
        df_gas_prices.rename({"forecast_date": "date"})
    )
    
    df_electricity_prices = (
        df_electricity_prices.rename({"forecast_date": "datetime"})
    )
    
    df_weather_station_to_county_mapping = (
        df_weather_station_to_county_mapping.with_columns(pl.col("latitude").cast(pl.datatypes.Float32),pl.col("longitude").cast(pl.datatypes.Float32))
    )
    
    # sum the target over all product_type values for each ("datetime", "county", "is_business", "is_consumption") group
    df_target_all_type_sum = (
        df_target.group_by(["datetime", "county", "is_business", "is_consumption"]).sum().drop("product_type")
    )
    
    df_forecast_weather = (
        df_forecast_weather.rename({"forecast_datetime": "datetime"}).filter(pl.col("hours_ahead") >= 24) # keep only forecasts at least 24 hours ahead; we don't need the forecast for the issue day itself
        .with_columns(pl.col("latitude").cast(pl.datatypes.Float32),pl.col("longitude").cast(pl.datatypes.Float32),
            # strip the timezone so forecast datetimes match the naive datetimes used elsewhere
            pl.col('datetime').dt.replace_time_zone(None).cast(pl.Datetime("us")),
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"]).drop("longitude", "latitude")
    )
    
    df_historical_weather = (
        df_historical_weather
        .with_columns(pl.col("latitude").cast(pl.datatypes.Float32),pl.col("longitude").cast(pl.datatypes.Float32),
        )
        .join(df_weather_station_to_county_mapping, how="left", on=["longitude", "latitude"]).drop("longitude", "latitude")
    )
    
    # creating average forecast characteristics for all weather stations
    df_forecast_weather_date = (
        df_forecast_weather.group_by("datetime").mean().drop("county")
    )
    
    # creating average forecast characteristics for weather stations related to county
    df_forecast_weather_local = (
        df_forecast_weather.filter(pl.col("county").is_not_null()).group_by("county", "datetime").mean()
    )
    
    # creating average historical characteristics for all weather stations
    df_historical_weather_date = (
        df_historical_weather.group_by("datetime").mean().drop("county")
    )
    
    # creating average historical characteristics for weather stations related to county
    df_historical_weather_local = (
        df_historical_weather.filter(pl.col("county").is_not_null()).group_by("county", "datetime").mean()
    )
    
    df_data = (
        df_data
        # shifting the right table's date/datetime forward with pl.duration(...) attaches the most recent values available at prediction time as lag features
        .join(df_gas_prices.with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date)), on="date", how="left")
        .join(df_client.with_columns((pl.col("date") + pl.duration(days=2)).cast(pl.Date)), on=["county", "is_business", "product_type", "date"], how="left")
        .join(df_electricity_prices.with_columns(pl.col("datetime") + pl.duration(days=1)), on="datetime", how="left")
        
        # forecast weather features: day-ahead values for the target datetime plus 7-day lagged versions
        .join(df_forecast_weather_date, on="datetime", how="left", suffix="_fd")
        .join(df_forecast_weather_local, on=["county", "datetime"], how="left", suffix="_fl")
        .join(df_forecast_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_fd_7d")
        .join(df_forecast_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_fl_7d")

        # historical weather features lagged by 2 and 7 days (the 2-day lag is the latest data available at prediction time)
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=2)), on="datetime", how="left", suffix="_hd_2d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=2)), on=["county", "datetime"], how="left", suffix="_hl_2d")
        .join(df_historical_weather_date.with_columns(pl.col("datetime") + pl.duration(days=7)), on="datetime", how="left", suffix="_hd_7d")
        .join(df_historical_weather_local.with_columns(pl.col("datetime") + pl.duration(days=7)), on=["county", "datetime"], how="left", suffix="_hl_7d")
        
        # target lags per segment: 2 to 7 days and 14 days back (target_1 ... target_7)
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_1"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_2"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=4)).rename({"target": "target_3"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=5)).rename({"target": "target_4"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=6)).rename({"target": "target_5"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_6"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        .join(df_target.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_7"}), on=["county", "is_business", "product_type", "is_consumption", "datetime"], how="left")
        
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=2)).rename({"target": "target_1"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=3)).rename({"target": "target_2"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=7)).rename({"target": "target_6"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        .join(df_target_all_type_sum.with_columns(pl.col("datetime") + pl.duration(days=14)).rename({"target": "target_7"}), on=["county", "is_business", "is_consumption", "datetime"], suffix="_all_type_sum", how="left")
        
        
        .with_columns(
            pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
            pl.col("datetime").dt.hour().alias("hour"),
            pl.col("datetime").dt.day().alias("day"),
            pl.col("datetime").dt.weekday().alias("weekday"),
            pl.col("datetime").dt.month().alias("month"),
            pl.col("datetime").dt.year().alias("year"),
        )
        .with_columns(
            pl.concat_str("county", "is_business", "product_type", "is_consumption", separator="_").alias("segment"),
        )
        # cyclical features encoding https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca
        .with_columns(
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        )
        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )
        
        .drop("datetime", "hour", "dayofyear")
    )
    
    return df_data
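In [ ]:
# Illustration only (toy data, not used by the pipeline): the lag joins above work by
# shifting the datetime key of the right-hand table forward with pl.duration(...) before
# a left join, so each training row picks up the value observed that many days earlier.
from datetime import datetime

_toy_main = pl.DataFrame({"datetime": [datetime(2023, 5, 3, 10)], "county": [0]})
_toy_hist = pl.DataFrame({"datetime": [datetime(2023, 5, 1, 10)], "county": [0], "temperature": [11.5]})

# shift the historical timestamps by 2 days: the 2023-05-01 value lands on the 2023-05-03 row
_toy_main.join(
    _toy_hist.with_columns(pl.col("datetime") + pl.duration(days=2)),
    on=["county", "datetime"],
    how="left",
)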
In [ ]:
## Convert features to pandas, index by row_id, cast categorical columns, and derive summary statistics of the target lags
def to_pandas(X, y=None):
    cat_cols = ["county", "is_business", "product_type", "is_consumption", "segment"]
    
    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()    
    
    df = df.set_index("row_id")
    df[cat_cols] = df[cat_cols].astype("category")
    
    df["target_mean"] = df[[f"target_{i}" for i in range(1, 7)]].mean(1)
    df["target_std"] = df[[f"target_{i}" for i in range(1, 7)]].std(1)
    df["target_ratio"] = df["target_6"] / (df["target_7"] + 1e-3)
    
    return df
In [ ]:
df_data, y = df_data.drop("target"), df_data.select("target")

df_train_features = generate_features(df_data, df_client, df_gas_prices, df_electricity_prices, df_forecast_weather, df_historical_weather, df_weather_station_to_county_mapping, df_target)

df_train_features = to_pandas(df_train_features, y)
# a small proportion of target values is null, so drop those rows
df_train_features = df_train_features[df_train_features['target'].notnull()]

# keep only data from 2022 onwards
df_train_features = df_train_features[df_train_features.year >= 2022]

df_train_features.head()
In [ ]:
import matplotlib.dates as mdates

# Filter the dataset for the specific segment
consumption_0_0_1_1 = df_train_features[df_train_features.segment == '0_0_1_1']

plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='target', data=consumption_0_0_1_1, color='blue', linewidth=1.5)

plt.title('Target Over Time for Segment 0_0_1_1', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Target', fontsize=12)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # Format the date display
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=15))  # Adjust the interval for better readability
plt.xticks(rotation=45)  # Rotate x-axis labels for clarity

# Show the plot
plt.tight_layout()
plt.show()
In [ ]:
segment_list = df_train_features.segment.unique()[:10]
example_df = df_train_features[np.isin(df_train_features.segment,segment_list)]

# Candidate periods expressed in hours (the data is sampled hourly)
periods_in_hours = {
    'Quarterly': (365 / 4) * 24,
    'Monthly': 30 * 24,
    'Weekly': 7 * 24,
    'Daily': 24,
    '12-hour': 12,
    '8-hour': 8,
    '6-hour': 6,
    '4-hour': 4
}
frequencies_for_periods = {k: 1/v for k, v in periods_in_hours.items()}

# Plot each segment's spectrum, vertically offset for readability
plt.figure(figsize=(14, 10))
plt.xscale('log')

for i, segment in enumerate(example_df['segment'].unique()):
    segment_data = example_df[example_df['segment'] == segment]['target']
    fft_values = np.fft.fft(segment_data)
    frequencies = np.fft.fftfreq(len(fft_values), d=1)  # d=1 because data is hourly
    magnitudes = np.abs(fft_values)[frequencies > 0]
    normalized_magnitudes = magnitudes / np.max(magnitudes)
    positive_freqs = frequencies[frequencies > 0]

    # Keep only frequencies above the quarterly frequency
    valid_freqs = positive_freqs[positive_freqs > frequencies_for_periods['Quarterly']]
    valid_magnitudes = normalized_magnitudes[positive_freqs > frequencies_for_periods['Quarterly']]

    offset_magnitudes = valid_magnitudes + i  # Offset for clarity

    plt.plot(valid_freqs, offset_magnitudes, label=f'Segment {segment}')

plt.title('Frequency Spectra of hourly target for Each Segment')
plt.xlabel('Frequency')
plt.ylabel('Normalized Magnitude + Offset')
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))
plt.legend()
plt.show()
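In [ ]:
# Sanity check on synthetic data (not part of the analysis): with hourly samples (d=1),
# a pure 24-hour cycle peaks at a frequency of 1/24 cycles per hour, i.e. at the 'Daily'
# tick in the plot above.
_t = np.arange(30 * 24)                        # 30 days of hourly samples
_signal = np.sin(2 * np.pi * _t / 24)          # pure daily cycle
_freqs = np.fft.fftfreq(len(_signal), d=1)
_mags = np.abs(np.fft.fft(_signal))
_peak_freq = _freqs[_freqs > 0][np.argmax(_mags[_freqs > 0])]
print(_peak_freq, 1 / _peak_freq)              # ~0.0417 cycles/hour -> 24-hour period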
In [ ]:
# Initialize the figure for the spectra
plt.figure(figsize=(7, 5))

# Convert the x-axis to a log scale
plt.xscale('log')

# Spectrum of temperature for segment 0_0_1_1
segment_data = example_df[example_df['segment'] == '0_0_1_1']['temperature']
fft_values = np.fft.fft(segment_data)
frequencies = np.fft.fftfreq(len(fft_values), d=1)
magnitudes = np.abs(fft_values)[frequencies > 0]
positive_freqs = frequencies[frequencies > 0]

plt.plot(positive_freqs, magnitudes, label='0_0_1_1')

# Customize the plot
plt.title('Frequency spectrum of Temperature for Segment 0_0_1_1')
plt.xlabel('Frequency')
plt.ylabel('Magnitude')

# Set the x-axis ticks and labels to the calculated frequencies for the periods
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))
plt.xticks(rotation=45)

plt.legend()
plt.show()
In [ ]:
# Initialize the figure for the spectra
plt.figure(figsize=(7, 5))

# Convert the x-axis to a log scale
plt.xscale('log')

# Spectrum of direct solar radiation for segment 0_0_1_1
segment_data = example_df[example_df['segment'] == '0_0_1_1']['direct_solar_radiation']
fft_values = np.fft.fft(segment_data)
frequencies = np.fft.fftfreq(len(fft_values), d=1)
magnitudes = np.abs(fft_values)[frequencies > 0]
positive_freqs = frequencies[frequencies > 0]

plt.plot(positive_freqs, magnitudes, label='0_0_1_1')

# Customize the plot
plt.title('Frequency spectrum of Direct Solar Radiation for Segment 0_0_1_1')
plt.xlabel('Frequency')
plt.ylabel('Magnitude')

# Set the x-axis ticks and labels to the calculated frequencies for the periods
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))
plt.xticks(rotation=45)

plt.legend()
plt.show()
In [ ]:
# Initialize the figure for the spectra
plt.figure(figsize=(7, 5))

# Convert the x-axis to a log scale
plt.xscale('log')

# Spectrum of hours_ahead for segment 0_0_1_1
segment_data = example_df[example_df['segment'] == '0_0_1_1']['hours_ahead']
fft_values = np.fft.fft(segment_data)
frequencies = np.fft.fftfreq(len(fft_values), d=1)
magnitudes = np.abs(fft_values)[frequencies > 0]
positive_freqs = frequencies[frequencies > 0]

plt.plot(positive_freqs, magnitudes, label='0_0_1_1')

# Customize the plot
plt.title('Frequency spectrum of hours_ahead for Segment 0_0_1_1')
plt.xlabel('Frequency')
plt.ylabel('Magnitude')

# Set the x-axis ticks and labels to the calculated frequencies for the periods
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))
plt.xticks(rotation=45)

plt.legend()
plt.show()
In [ ]:
# Initialize the figure for the spectra
plt.figure(figsize=(7, 5))

# Convert the x-axis to a log scale
plt.xscale('log')

# Spectrum of the highest gas price for segment 0_0_1_1
segment_data = example_df[example_df['segment'] == '0_0_1_1']['highest_price_per_mwh']
fft_values = np.fft.fft(segment_data)
frequencies = np.fft.fftfreq(len(fft_values), d=1)
magnitudes = np.abs(fft_values)[frequencies > 0]
positive_freqs = frequencies[frequencies > 0]

plt.plot(positive_freqs, magnitudes, label='0_0_1_1')

# Customize the plot
plt.title('Frequency spectrum of Highest Gas Price for Segment 0_0_1_1')
plt.xlabel('Frequency')
plt.ylabel('Magnitude')

# Set the x-axis ticks and labels to the calculated frequencies for the periods
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))
plt.xticks(rotation=45)

plt.legend()
plt.show()
In [ ]:
segment_list = df_train_features.segment.unique()[:10]
example_df = df_train_features[np.isin(df_train_features.segment,segment_list)]

# Initialize the figure for the spectra
plt.figure(figsize=(14, 10))

# Convert the x-axis to a log scale
plt.xscale('log')

# Plot the spectrum for each segment with offset
for i, segment in enumerate(example_df['segment'].unique()):
    segment_data = example_df[example_df['segment'] == segment].groupby('date')['target'].mean()
    fft_values = np.fft.fft(segment_data)
    frequencies = np.fft.fftfreq(len(fft_values), d=1)
    magnitudes = np.abs(fft_values)[frequencies > 0]
    normalized_magnitudes = magnitudes / np.max(magnitudes)
    positive_freqs = frequencies[frequencies > 0]

    # Offset each segment's spectrum for clarity
    offset_magnitudes = normalized_magnitudes + i

    plt.plot(positive_freqs, offset_magnitudes, label=f'Segment {segment}')

# Customize the plot
plt.title('Frequency Spectra of daily averaged target for Each Segment')
plt.xlabel('Frequency')
plt.ylabel('Normalized Magnitude + Offset')

# Set the x-axis ticks and labels to the calculated frequencies for the periods
plt.xticks(list(frequencies_for_periods.values()), list(frequencies_for_periods.keys()))

plt.legend()
plt.show()
In [ ]:
from numpy.linalg import svd, matrix_rank

class SSA:
    def __init__(self, series, window_size):
        self.series = np.array(series)
        self.window_size = window_size
        self.n = len(series)
        self.trajectory_matrix = self.create_trajectory_matrix()

    def create_trajectory_matrix(self):
        n_vectors = self.n - self.window_size + 1
        return np.column_stack([self.series[i:i+self.window_size] for i in range(n_vectors)])

    def decompose(self):
        U, sigma, Vt = svd(self.trajectory_matrix)
        self.rank = matrix_rank(self.trajectory_matrix)
        return U, sigma, Vt

    def reconstruct(self, U, sigma, Vt, components):
        rank = len(components)
        if rank > self.rank:
            raise ValueError("Number of components cannot be greater than the rank of the trajectory matrix")

        series_components = []
        for i in components:
            X_i = sigma[i] * np.outer(U[:,i], Vt[i,:])
            series_components.append(self.diagonal_averaging(X_i))
        
        return sum(series_components)

    def diagonal_averaging(self, X_i):
        n_rows, n_cols = X_i.shape
        diagonals = [X_i[::-1, :].diagonal(i) for i in range(-X_i.shape[0]+1, X_i.shape[1])]
        return np.array([np.mean(diagonal) for diagonal in diagonals])
    
    def calculate_w_correlation(self, U, sigma, Vt):
        n = self.window_size
        N = self.n
        # w-correlation weights: how often each time index appears across the windows
        w = np.array([min(i+1, n, N-i) for i in range(N)])
        w_corr_matrix = np.zeros((self.rank, self.rank))

        # reconstruct each elementary series once, instead of repeatedly inside the double loop
        elementary_series = [self.reconstruct(U, sigma, Vt, [i]) for i in range(self.rank)]

        for i in range(self.rank):
            for j in range(i, self.rank):
                F_i = elementary_series[i]
                F_j = elementary_series[j]
                w_corr_matrix[i, j] = np.sum(w * F_i * F_j) / np.sqrt(np.sum(w * F_i ** 2) * np.sum(w * F_j ** 2))
                w_corr_matrix[j, i] = w_corr_matrix[i, j]
        
        return w_corr_matrix

    def plot_w_correlation(self, U, sigma, Vt):
        w_corr_matrix = self.calculate_w_correlation(U, sigma, Vt)
        plt.imshow(w_corr_matrix, cmap='hot', interpolation='nearest')
        plt.colorbar()
        plt.title('W-Correlation Matrix')
        plt.xlabel('Component')
        plt.ylabel('Component')
        plt.show()
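In [ ]:
# Sanity check for the SSA class (toy series, illustrative only): summing all elementary
# components and diagonal-averaging should recover the original series, because the SVD
# reconstructs the trajectory matrix exactly.
_toy_series = np.sin(np.arange(60) / 5.0) + 0.05 * np.arange(60)
_ssa_toy = SSA(_toy_series, window_size=10)
_U_toy, _sigma_toy, _Vt_toy = _ssa_toy.decompose()
_recon = _ssa_toy.reconstruct(_U_toy, _sigma_toy, _Vt_toy, components=list(range(_ssa_toy.rank)))
print(np.allclose(_recon, _toy_series))        # expected: True (up to numerical tolerance)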
In [ ]:
# Group by 'date' and calculate the mean of 'target' for each day
daily_avg = consumption_0_0_1_1.groupby('date')['target'].mean()

# Embed the series with a window of 200 days; the window size caps the number of SSA components
window_size = 200
ssa = SSA(daily_avg, window_size)
U, sigma, Vt = ssa.decompose()

# Reconstruct the series using the first 10 components
reconstructed_series = ssa.reconstruct(U, sigma, Vt, components=list(range(10)))

# Plot the original and reconstructed series
plt.figure(figsize=(12, 6))
plt.plot(daily_avg.index, daily_avg.values, alpha=0.5, label='Original Daily Average Series')
plt.plot(daily_avg.index, reconstructed_series, label='Reconstructed Series')
plt.title('Original vs Reconstructed Series')
plt.xlabel('Date')
plt.ylabel('Average Target')
plt.legend()
plt.show()

# Subtract the reconstructed series from the original series
residual_series = daily_avg - reconstructed_series

# Plot the residual series
plt.figure(figsize=(12, 6))
plt.plot(daily_avg.index, residual_series, label='Residual Series')
plt.title('Residual Series after Subtracting Reconstructed Components')
plt.xlabel('Date')
plt.ylabel('Residual Target Value')
plt.legend()
plt.show()
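In [ ]:
# Optional diagnostic (not in the original workflow): a scree plot of the singular values
# helps judge how many leading components are worth keeping in the reconstruction above.
plt.figure(figsize=(8, 4))
plt.semilogy(sigma, marker='o', linestyle='none')
plt.title('Singular Value Spectrum')
plt.xlabel('Component index')
plt.ylabel('Singular value (log scale)')
plt.show()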
In [ ]:
plt.figure(figsize=(15, 20))
for i in range(10):
    plt.subplot(10, 1, i+1)
    plt.plot(Vt[i, :], label=f'Component {i+1}')
    plt.title(f'Mode Shape of Component {i+1}')
plt.tight_layout()
plt.show()
In [ ]:
ssa.plot_w_correlation(U, sigma, Vt)
In [ ]:
df_train_features = df_train_features.drop('date', axis=1)
In [ ]:
# import lightgbm as lgb
# import random
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint as sp_randint
# from scipy.stats import uniform as sp_uniform

# def tune_lgbm_model(X_train, y_train, random_state, n_iter=50, cv=3):
#     param_dist = {
#         'path_smooth': sp_uniform(0.01, 0.09),
#         'colsample_bytree': sp_uniform(0.1, 0.9),
#         'colsample_bynode': sp_uniform(0.1, 0.9),
#         'reg_alpha': sp_uniform(1, 9), 
#         'reg_lambda': sp_uniform(1, 9), 
#         'num_leaves': sp_randint(31, 500),
#         'min_child_samples': sp_randint(30, 250),
#         'max_depth': sp_randint(1, 29),
#         'learning_rate': sp_uniform(0.001, 0.09),
#         'objective': ['regression', 'regression_l1', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
#     }

#     lgb_reg = lgb.LGBMRegressor()

#     random_search = RandomizedSearchCV(
#         estimator=lgb_reg, 
#         param_distributions=param_dist,
#         n_iter=n_iter, 
#         scoring='neg_mean_absolute_error',
#         cv=cv, 
#         verbose=0, 
#         random_state=random_state
#     )

#     random_search.fit(X_train, y_train)

#     return random_search.best_params_

# random_states = random.sample(range(1, 100), 10)

# mask = df_train_features['is_consumption'] == 1
# X_train = df_train_features[mask].drop(columns=["target"])
# y_train = df_train_features[mask]["target"]

# parameter_list = []

# for rs in random_states:
#     best_params = tune_lgbm_model(X_train, y_train, rs)
#     best_params_with_fixed = {
#         'num_iterations': 500,
#         'verbose': -1,
#         **best_params
#     }
#     parameter_list.append(best_params_with_fixed)
#     print(f"Best parameters for random state {rs}: {best_params_with_fixed}")
In [ ]:
parameter_list = [{'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.7103688779357847, 'colsample_bytree': 0.6759995481832958, 'learning_rate': 0.08107710302284142, 'max_depth': 14, 'min_child_samples': 175, 'num_leaves': 169, 'objective': 'tweedie', 'path_smooth': 0.03638766162304646, 'reg_alpha': 4.241310552615537, 'reg_lambda': 1.91388003514709}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.5762487113184288, 'colsample_bytree': 0.5811355426944578, 'learning_rate': 0.08597630012152205, 'max_depth': 26, 'min_child_samples': 194, 'num_leaves': 361, 'objective': 'regression', 'path_smooth': 0.03465551777589707, 'reg_alpha': 6.996661462358405, 'reg_lambda': 8.470525016492395}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.43517350965256685, 'colsample_bytree': 0.7831532979997649, 'learning_rate': 0.07025903099500738, 'max_depth': 23, 'min_child_samples': 211, 'num_leaves': 321, 'objective': 'regression', 'path_smooth': 0.08559566561454247, 'reg_alpha': 2.6435598455478146, 'reg_lambda': 4.51993047978816}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.6069746474662208, 'colsample_bytree': 0.47723286276461496, 'learning_rate': 0.06380390287538754, 'max_depth': 10, 'min_child_samples': 245, 'num_leaves': 192, 'objective': 'regression', 'path_smooth': 0.03607590859625657, 'reg_alpha': 3.3788113950113328, 'reg_lambda': 5.702284030317997}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.360239888533631, 'colsample_bytree': 0.8355751080647349, 'learning_rate': 0.07606039804311976, 'max_depth': 16, 'min_child_samples': 121, 'num_leaves': 478, 'objective': 'regression', 'path_smooth': 0.0997680881977257, 'reg_alpha': 3.734520254197353, 'reg_lambda': 9.635057034367291}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.4174740961644613, 'colsample_bytree': 0.7508888306232886, 'learning_rate': 0.07708410483102476, 'max_depth': 25, 'min_child_samples': 216, 'num_leaves': 380, 'objective': 'regression', 'path_smooth': 0.06390932048435091, 'reg_alpha': 3.326634605594462, 'reg_lambda': 4.437207873684814}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.7381382697935609, 'colsample_bytree': 0.523576726896179, 'learning_rate': 0.06981333132164486, 'max_depth': 11, 'min_child_samples': 126, 'num_leaves': 214, 'objective': 'tweedie', 'path_smooth': 0.07218072257652315, 'reg_alpha': 9.975905654063325, 'reg_lambda': 2.551064575107957}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.9235361356621526, 'colsample_bytree': 0.43075258164239827, 'learning_rate': 0.06755536733597006, 'max_depth': 28, 'min_child_samples': 243, 'num_leaves': 213, 'objective': 'regression', 'path_smooth': 0.020429341100388392, 'reg_alpha': 2.553972315959381, 'reg_lambda': 2.4041022554705522}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.44166912467574293, 'colsample_bytree': 0.6606565063118934, 'learning_rate': 0.0897868781748779, 'max_depth': 18, 'min_child_samples': 240, 'num_leaves': 42, 'objective': 'tweedie', 'path_smooth': 0.01116875211972208, 'reg_alpha': 2.4423874109911345, 'reg_lambda': 4.128799810939097}, {'num_iterations': 500, 'verbose': -1, 'colsample_bynode': 0.9525732017313245, 'colsample_bytree': 0.6382630603007415, 'learning_rate': 0.08270391387009293, 'max_depth': 12, 'min_child_samples': 243, 'num_leaves': 292, 'objective': 'tweedie', 'path_smooth': 0.08723701882589538, 'reg_alpha': 6.1157717164194905, 'reg_lambda': 6.675900725606523}]
In [ ]:
model_consumption = VotingRegressor([
        ('lgb_0', lgb.LGBMRegressor(**parameter_list[0], random_state=42)),
        ('lgb_1', lgb.LGBMRegressor(**parameter_list[1], random_state=42)),
        ('lgb_2', lgb.LGBMRegressor(**parameter_list[2], random_state=42)), 
        ('lgb_3', lgb.LGBMRegressor(**parameter_list[3], random_state=42)), 
        ('lgb_4', lgb.LGBMRegressor(**parameter_list[4], random_state=42)), 
        ('lgb_5', lgb.LGBMRegressor(**parameter_list[5], random_state=42)), 
        ('lgb_6', lgb.LGBMRegressor(**parameter_list[6], random_state=42)), 
        ('lgb_7', lgb.LGBMRegressor(**parameter_list[7], random_state=42)),
        ('lgb_8', lgb.LGBMRegressor(**parameter_list[8], random_state=42)),
        ('lgb_9', lgb.LGBMRegressor(**parameter_list[9], random_state=42)),
],weights=[0.14,0.13,0.08,0.11,0.09,0.1,0.09,0.07,0.12,0.07])
    
model_production = VotingRegressor([
        ('lgb_10', lgb.LGBMRegressor(**parameter_list[0], random_state=42)),
        ('lgb_11', lgb.LGBMRegressor(**parameter_list[1], random_state=42)),
        ('lgb_12', lgb.LGBMRegressor(**parameter_list[2], random_state=42)), 
        ('lgb_13', lgb.LGBMRegressor(**parameter_list[3], random_state=42)), 
        ('lgb_14', lgb.LGBMRegressor(**parameter_list[4], random_state=42)), 
        ('lgb_15', lgb.LGBMRegressor(**parameter_list[5], random_state=42)), 
        ('lgb_16', lgb.LGBMRegressor(**parameter_list[6], random_state=42)), 
        ('lgb_17', lgb.LGBMRegressor(**parameter_list[7], random_state=42)),
        ('lgb_18', lgb.LGBMRegressor(**parameter_list[8], random_state=42)),
        ('lgb_19', lgb.LGBMRegressor(**parameter_list[9], random_state=42)),
],weights=[0.14,0.13,0.08,0.11,0.09,0.1,0.09,0.07,0.12,0.07])

mask = df_train_features['is_consumption'] == 1
model_consumption.fit(
    X=df_train_features[mask].drop(columns=["target"]),
    y=df_train_features[mask]["target"]
)

mask = df_train_features['is_consumption'] == 0
model_production.fit(
    X=df_train_features[mask].drop(columns=["target"]),
    y=df_train_features[mask]["target"]
)
In [ ]:
import enefit
env = enefit.make_env()
iter_test = env.iter_test()
In [ ]:
for (
    test, 
    revealed_targets, 
    client, 
    historical_weather,
    forecast_weather, 
    electricity_prices, 
    gas_prices, 
    sample_prediction
) in iter_test:
    
    test = test.rename(columns={"prediction_datetime": "datetime"})
    
    df_test = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    df_client = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_gas_prices = pl.from_pandas(gas_prices[gas_prices_cols], schema_overrides=schema_gas)
    df_electricity_prices = pl.from_pandas(electricity_prices[electricity_prices_cols], schema_overrides=schema_electricity)
    df_new_forecast_weather = pl.from_pandas(forecast_weather[forecast_weather_cols], schema_overrides=schema_forecast)
    df_new_historical_weather = pl.from_pandas(historical_weather[historical_weather_cols], schema_overrides=schema_historical)
    df_new_target = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)
    
    df_forecast_weather = pl.concat([df_forecast_weather, df_new_forecast_weather]).unique(['forecast_datetime', 'latitude', 'longitude', 'hours_ahead'])
    df_historical_weather = pl.concat([df_historical_weather, df_new_historical_weather]).unique(['datetime', 'latitude', 'longitude'])
    df_target = pl.concat([df_target, df_new_target]).unique(['datetime', 'county', 'is_business', 'product_type', 'is_consumption'])
    
    df_test_features = generate_features(
        df_test, 
        df_client, 
        df_gas_prices, 
        df_electricity_prices, 
        df_forecast_weather, 
        df_historical_weather, 
        df_weather_station_to_county_mapping, 
        df_target
    )
    df_test_features = to_pandas(df_test_features)
    df_test_features = df_test_features.drop('date',axis=1)
    
    mask = df_test_features['is_consumption'] == 1
    # the target is nonnegative, so clip any negative model predictions at 0
    sample_prediction.loc[mask.values, "target"] = model_consumption.predict(df_test_features[mask]).clip(0)
    
    mask = df_test_features['is_consumption'] == 0
    sample_prediction.loc[mask.values, "target"] = model_production.predict(df_test_features[mask]).clip(0)
    
    # send predictions
    env.predict(sample_prediction)